From 82e12bf3876a68a5c1cafe8bb622fb61e393e149 Mon Sep 17 00:00:00 2001
From: Tim Shen <timshen@google.com>
Date: Tue, 7 Jul 2020 20:55:32 -0700
Subject: [PATCH] [XLA/GPU] Decouple hlo_ordering from thunk_schedule.

The plumbing before this change:
* hlo_ordering -> buffer_assignment
* buffer_assignment -> ir_emitter_unnested (DFS order) -> thunks
* Apply hlo_ordering to thunks -> thunk_schedule

After:
* hlo_ordering -> buffer_assignment
* buffer_assignment -> ir_emitter_unnested (hlo_ordering) -> thunks
* thunks -> thunk_schedule (order unchanged)

The idea is that since thunks are scheduled in a certain total order
anyway, we can just use that order to invoke the emitter. This saves an
extra scheduling pass, but most importantly, it removes uses of
Thunk::hlo_instruction(), which makes the MLIR/GPU transition easier.

PiperOrigin-RevId: 320117281
Change-Id: I0ee9ff14e71869ea09d6223ae10448317298096f
---
 tensorflow/compiler/xla/service/gpu/gpu_compiler.cc |  8 +++++---
 .../compiler/xla/service/gpu/thunk_schedule.cc      | 13 +++++--------
 .../compiler/xla/service/gpu/thunk_schedule.h       |  3 +--
 .../xla/service/mlir_gpu/lhlo_dialect_emitter.cc    |  6 ++++--
 .../xla/service/mlir_gpu/lhlo_dialect_emitter.h     |  3 ++-
 .../xla/service/mlir_gpu/mlir_compiler_impl.cc      |  6 +++---
 6 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 2b31099d26f..f5ed7e3a114 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -518,8 +518,11 @@ static Status CompileModuleToLlvmIrImpl(
   {
     XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - IR emission");
-    TF_RETURN_IF_ERROR(entry_computation->Accept(&ir_emitter));
+    TF_RETURN_IF_ERROR(entry_computation->AcceptOrdered(
+        &ir_emitter, (*hlo_schedule)->ThunkLaunchOrder()));
   }
+  // The order of `thunk_sequence` corresponds to
+  // `hlo_schedule->ThunkLaunchOrder()`.
   *thunk_sequence = ir_emitter.ConsumeThunkSequence();
 
   return Status::OK();
 }
@@ -610,8 +613,7 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
                                          gpu_version, stream_exec));
 
   auto thunk_schedule = absl::make_unique<ThunkSchedule>(
-      std::move(thunk_sequence), std::move(stream_assignment),
-      hlo_schedule->ThunkLaunchOrder());
+      std::move(thunk_sequence), std::move(stream_assignment));
   if (DumpingEnabledForHloModule(*module)) {
     DumpToFileInDirOrStdout(*module, "", "thunk_schedule",
                             thunk_schedule->ToString());
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
index daa5f33e560..a91466e5c5f 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
@@ -49,21 +49,18 @@ void ThunkSchedule::AddDependenciesOnTransitiveOperands(
 
 ThunkSchedule::ThunkSchedule(
     std::unique_ptr<ThunkSequence> thunks,
-    std::unique_ptr<StreamAssignment> stream_assignment,
-    const std::vector<HloInstruction*>& hlo_total_order)
+    std::unique_ptr<StreamAssignment> stream_assignment)
     : thunks_(std::move(thunks)),
       stream_assignment_(std::move(stream_assignment)) {
+  for (auto& thunk : *thunks_) {
+    thunk_total_order_.push_back(thunk.get());
+  }
+
   absl::flat_hash_map<const HloInstruction*, Thunk*> hlo_to_thunk;
   for (const auto& thunk : *thunks_) {
     InsertOrDie(&hlo_to_thunk, thunk->hlo_instruction(), thunk.get());
   }
 
-  for (HloInstruction* hlo : hlo_total_order) {
-    if (Thunk** thunk = tensorflow::gtl::FindOrNull(hlo_to_thunk, hlo)) {
-      thunk_total_order_.push_back(*thunk);
-    }
-  }
-
   for (const Thunk* thunk : thunk_total_order_) {
     const auto* dst = thunk->hlo_instruction();
     CHECK(stream_assignment_->HasStreamAssigned(*dst));
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
index 549378debd5..73da708aa3d 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
@@ -47,8 +47,7 @@ namespace gpu {
 class ThunkSchedule {
  public:
   ThunkSchedule(std::unique_ptr<ThunkSequence> thunks,
-                std::unique_ptr<StreamAssignment> stream_assignment,
-                const std::vector<HloInstruction*>& hlo_total_order);
+                std::unique_ptr<StreamAssignment> stream_assignment);
 
   // Returns the total order of executing all the thunks.
   const std::vector<Thunk*>& TotalOrder() const { return thunk_total_order_; }
diff --git a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc
index a65096b7eac..3654271da53 100644
--- a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc
+++ b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc
@@ -226,8 +226,10 @@ absl::string_view LhloDialectEmitter::platform_name() const {
   return platform_->Name();
 }
 
-Status LhloDialectEmitter::EmitComputation(const HloComputation& computation) {
-  return computation.root_instruction()->Accept(this);
+Status LhloDialectEmitter::EmitComputation(
+    const HloComputation& computation,
+    absl::Span<HloInstruction* const> ordering) {
+  return computation.AcceptOrdered(this, ordering);
 }
 
 StatusOr<::mlir::FuncOp> LhloDialectEmitter::CreateFunction(
diff --git a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h
index 185c1e13bb7..2fe1947d625 100644
--- a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h
+++ b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h
@@ -47,7 +47,8 @@ class LhloDialectEmitter : public DfsHloVisitorWithDefault,
                      ::mlir::ModuleOp mlir_module);
   ~LhloDialectEmitter() override = default;
 
-  Status EmitComputation(const HloComputation& computation);
+  Status EmitComputation(const HloComputation& computation,
+                         absl::Span<HloInstruction* const> ordering);
 
   // The following methods implement the DfsHloVisitor interface.
   //
diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc
index beabc99a173..a2bee43a0f8 100644
--- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc
+++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc
@@ -489,7 +489,8 @@ StatusOr<std::unique_ptr<Executable>> MlirCompilerImpl::RunBackend(
                                 stream_exec->platform(), *mlir_module);
 
   TF_RETURN_IF_ERROR(lhlo_emitter.EmitComputation(
-      *emission_context.getHloModule()->entry_computation()));
+      *emission_context.getHloModule()->entry_computation(),
+      hlo_schedule->ThunkLaunchOrder()));
 
   TF_RETURN_IF_ERROR(
       module_hook_.invoke(IRHook::LoweringStage::LHLO, *mlir_module));
@@ -539,8 +540,7 @@ StatusOr<std::unique_ptr<Executable>> MlirCompilerImpl::RunBackend(
                                      gpu::PtxOptsFromConfig(config)));
 
   auto thunk_schedule = absl::make_unique<ThunkSchedule>(
-      std::move(thunk_sequence), std::move(stream_assignment),
-      hlo_schedule->ThunkLaunchOrder());
+      std::move(thunk_sequence), std::move(stream_assignment));
 
   if (DumpingEnabledForHloModule(*emission_context.getHloModule())) {
     DumpToFileInDirOrStdout(*emission_context.getHloModule(), "",
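
--
A minimal, self-contained C++ sketch of the contract this change relies on.
The Thunk, ThunkSequence, and ThunkSchedule below are simplified stand-ins,
not the real XLA classes, and the HLO names are made up: once the emitter is
driven in ThunkLaunchOrder, the emitted sequence already is the total order,
so the schedule can adopt it without a separate reordering pass.

// sketch.cc -- simplified stand-ins, not the real XLA types.
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct Thunk {
  std::string hlo_name;  // Stand-in for the HLO the thunk was emitted for.
};
using ThunkSequence = std::vector<std::unique_ptr<Thunk>>;

class ThunkSchedule {
 public:
  // Post-change constructor shape: no hlo_total_order parameter. The caller
  // guarantees `thunks` was emitted in launch order, so the total order is
  // simply the sequence order.
  explicit ThunkSchedule(std::unique_ptr<ThunkSequence> thunks)
      : thunks_(std::move(thunks)) {
    for (auto& thunk : *thunks_) {
      thunk_total_order_.push_back(thunk.get());
    }
  }

  const std::vector<Thunk*>& TotalOrder() const { return thunk_total_order_; }

 private:
  std::unique_ptr<ThunkSequence> thunks_;
  std::vector<Thunk*> thunk_total_order_;
};

int main() {
  // "Emission" driven by a precomputed launch order (made-up HLO names)
  // yields thunks already in that order.
  const std::vector<std::string> launch_order = {"param0", "multiply", "add"};
  auto thunks = std::make_unique<ThunkSequence>();
  for (const std::string& hlo : launch_order) {
    thunks->push_back(std::make_unique<Thunk>(Thunk{hlo}));
  }

  // No separate scheduling step: the schedule adopts the sequence as-is.
  ThunkSchedule schedule(std::move(thunks));
  for (const Thunk* thunk : schedule.TotalOrder()) {
    std::cout << thunk->hlo_name << "\n";  // param0, multiply, add
  }
  return 0;
}

The real constructor also takes a StreamAssignment and builds inter-stream
dependencies; the sketch elides that to isolate the ordering change.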