From 2c332fbc4be71314888cdf6ed4323855243f9610 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Feb 2021 13:53:40 -0800
Subject: [PATCH] [XLA/GPU] Use SequentialHloOrdering for single stream
 modules.

Also simplify tests in gpu_hlo_schedule_test.

PiperOrigin-RevId: 358025620
Change-Id: I1de2f85c91765e967865c6ea90bc629627146559
---
 .../compiler/xla/service/gpu/gpu_compiler.cc  |  8 +-
 .../xla/service/gpu/gpu_hlo_schedule.cc       | 25 ++++---
 .../xla/service/gpu/gpu_hlo_schedule.h        |  2 +-
 .../xla/service/gpu/gpu_hlo_schedule_test.cc  | 74 ++++++++++++++++---
 .../xla/service/gpu/ir_emitter_unnested.cc    | 28 +------
 5 files changed, 86 insertions(+), 51 deletions(-)
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 935c392dcee..d104c26964b 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -520,9 +520,9 @@ GpuCompiler::RunHloPassesAndBufferAssignement(
 
   std::unique_ptr<StreamAssignment> stream_assignment =
       AssignStreams(*hlo_module);
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<GpuHloSchedule> hlo_schedule,
-                      GpuHloSchedule::Build(hlo_module.get(),
-                                            *stream_assignment, pointer_size_));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<GpuHloSchedule> hlo_schedule,
+      GpuHloSchedule::Build(*hlo_module, *stream_assignment, pointer_size_));
 
   auto buffer_size_bytes_function =
       [this](const BufferValue& buffer_value) -> int64 {
@@ -565,7 +565,7 @@ static Status CompileModuleToLlvmIrImpl(
       AssignStreams(*hlo_module);
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<GpuHloSchedule> hlo_schedule,
-      GpuHloSchedule::Build(hlo_module, *stream_assignment, pointer_size));
+      GpuHloSchedule::Build(*hlo_module, *stream_assignment, pointer_size));
 
   auto buffer_size_bytes_function =
       [pointer_size](const BufferValue& buffer_value) -> int64 {
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc
index 61c50622bed..1126943624a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc
@@ -190,29 +190,30 @@ GpuHloSchedule::GpuHloSchedule() {}
 
 /* static */
 StatusOr<std::unique_ptr<GpuHloSchedule>> GpuHloSchedule::Build(
-    HloModule* module, const StreamAssignment& stream_assignment,
+    const HloModule& module, const StreamAssignment& stream_assignment,
     int64 pointer_size) {
   std::unique_ptr<GpuHloSchedule> schedule(new GpuHloSchedule);
 
   // Initialize thunk_launch_order_, the total order of thunk launches.
-  HloComputation* entry_computation = module->entry_computation();
+  HloComputation* entry_computation = module.entry_computation();
   if (stream_assignment.StreamCount() == 1) {
+    // All kernels are launched on a single stream, so there's no loss of
+    // concurrency by optimizing for minimal memory usage.
     TF_ASSIGN_OR_RETURN(
-        HloSchedule sequences,
-        ScheduleModule(module, [pointer_size](const BufferValue& buffer) {
-          return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size);
-        }));
-    schedule->thunk_launch_order_ =
-        sequences.sequence(entry_computation).instructions();
-    schedule->hlo_ordering_ =
-        absl::make_unique<SequentialHloOrdering>(sequences);
+        HloInstructionSequence sequence,
+        ScheduleComputation(
+            entry_computation, [pointer_size](const BufferValue& buffer) {
+              return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size);
+            }));
+    schedule->thunk_launch_order_ = sequence.instructions();
   } else {
     // BFS tends to increase concurrency, but also increases memory usage.
     BFSLaunchOrder(entry_computation, &schedule->thunk_launch_order_);
-    schedule->hlo_ordering_ = absl::make_unique<GpuHloOrdering>(
-        module, stream_assignment, schedule->thunk_launch_order_);
   }
 
+  schedule->hlo_ordering_ = absl::make_unique<GpuHloOrdering>(
+      &module, stream_assignment, schedule->thunk_launch_order_);
+
   return std::move(schedule);
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h
index c1493cee3ce..7f224ffe4f0 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h
@@ -41,7 +41,7 @@ class GpuHloSchedule {
   // Constructs an GpuHloSchedule for the given module, based on the given
   // stream assignment.
   static StatusOr<std::unique_ptr<GpuHloSchedule>> Build(
-      HloModule* module, const StreamAssignment& stream_assignment,
+      const HloModule& module, const StreamAssignment& stream_assignment,
       int64 pointer_size);
 
   // Returns the total order of thunk launches, represented in terms of HLO
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc
index 82cfd688380..91db7151f22 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc
@@ -39,7 +39,7 @@ class GpuHloScheduleTest : public HloTestBase {
   Shape f32_2x2_ = ShapeUtil::MakeShape(F32, {2, 2});
 
   static std::unique_ptr<GpuHloSchedule> BuildGpuHloSchedule(
-      HloModule* module, const StreamAssignment& streams) {
+      const HloModule& module, const StreamAssignment& streams) {
     return GpuHloSchedule::Build(module, streams, /*pointer_size=*/8)
         .ConsumeValueOrDie();
   }
@@ -86,7 +86,7 @@ TEST_F(GpuHloScheduleTest, SequentialMatMul) {
   EXPECT_EQ(streams->StreamNumberForHlo(*dot1),
             streams->StreamNumberForHlo(*dot2));
 
-  auto schedule = BuildGpuHloSchedule(module.get(), *streams);
+  auto schedule = BuildGpuHloSchedule(*module, *streams);
   // Remove parameters, which are unordered.
   EXPECT_EQ(RemoveHlo(schedule->ThunkLaunchOrder(), {x, y, z}),
             HloVec({dot1, dot2}));
@@ -94,10 +94,32 @@ TEST_F(GpuHloScheduleTest, SequentialMatMul) {
   // Parameters x,y,z are mutually unordered, while dot1 and dot2 are
   // transitively ordered by operands.
   auto order = schedule->ConsumeHloOrdering();
-  EXPECT_TRUE(order->ExecutesBefore(z, y));
-  EXPECT_TRUE(order->ExecutesBefore(y, x));
   EXPECT_TRUE(order->ExecutesBefore(x, dot1));
+  EXPECT_TRUE(order->ExecutesBefore(x, dot2));
+  EXPECT_TRUE(order->ExecutesBefore(y, dot1));
+  EXPECT_TRUE(order->ExecutesBefore(y, dot2));
+  EXPECT_TRUE(order->ExecutesBefore(z, dot2));
   EXPECT_TRUE(order->ExecutesBefore(dot1, dot2));
+
+  EXPECT_FALSE(order->ExecutesBefore(x, x));
+  EXPECT_FALSE(order->ExecutesBefore(x, y));
+  EXPECT_FALSE(order->ExecutesBefore(x, z));
+  EXPECT_FALSE(order->ExecutesBefore(y, x));
+  EXPECT_FALSE(order->ExecutesBefore(y, y));
+  EXPECT_FALSE(order->ExecutesBefore(y, z));
+  EXPECT_FALSE(order->ExecutesBefore(z, x));
+  EXPECT_FALSE(order->ExecutesBefore(z, y));
+  EXPECT_FALSE(order->ExecutesBefore(z, z));
+  EXPECT_FALSE(order->ExecutesBefore(z, dot1));
+  EXPECT_FALSE(order->ExecutesBefore(dot1, x));
+  EXPECT_FALSE(order->ExecutesBefore(dot1, y));
+  EXPECT_FALSE(order->ExecutesBefore(dot1, z));
+  EXPECT_FALSE(order->ExecutesBefore(dot1, dot1));
+  EXPECT_FALSE(order->ExecutesBefore(dot2, x));
+  EXPECT_FALSE(order->ExecutesBefore(dot2, y));
+  EXPECT_FALSE(order->ExecutesBefore(dot2, z));
+  EXPECT_FALSE(order->ExecutesBefore(dot2, dot1));
+  EXPECT_FALSE(order->ExecutesBefore(dot2, dot2));
 }
 
 // Test of a single stream, where data dependencies do not fully determine the
@@ -126,7 +148,7 @@ TEST_F(GpuHloScheduleTest, SequentialAdd) {
   EXPECT_EQ(streams->StreamNumberForHlo(*add1),
             streams->StreamNumberForHlo(*add3));
 
-  auto schedule = BuildGpuHloSchedule(module.get(), *streams);
+  auto schedule = BuildGpuHloSchedule(*module, *streams);
   // Remove parameters, which are unordered.
   EXPECT_EQ(RemoveHlo(schedule->ThunkLaunchOrder(), {x, y, z}),
             HloVec({add1, add2, add3}));
@@ -134,11 +156,43 @@ TEST_F(GpuHloScheduleTest, SequentialAdd) {
   // Parameters x,y,z are mutually unordered, while add1, add2 and add3 are
   // transitively ordered by operands.
   auto order = schedule->ConsumeHloOrdering();
-  EXPECT_TRUE(order->ExecutesBefore(y, z));
-  EXPECT_TRUE(order->ExecutesBefore(z, x));
   EXPECT_TRUE(order->ExecutesBefore(x, add1));
-  EXPECT_TRUE(order->ExecutesBefore(add1, add2));
+  EXPECT_TRUE(order->ExecutesBefore(x, add2));
+  EXPECT_TRUE(order->ExecutesBefore(x, add3));
+  EXPECT_TRUE(order->ExecutesBefore(y, add1));
+  EXPECT_TRUE(order->ExecutesBefore(y, add2));
+  EXPECT_TRUE(order->ExecutesBefore(y, add3));
+  EXPECT_TRUE(order->ExecutesBefore(z, add2));
+  EXPECT_TRUE(order->ExecutesBefore(z, add3));
+  EXPECT_TRUE(order->ExecutesBefore(add1, add3));
   EXPECT_TRUE(order->ExecutesBefore(add2, add3));
+  // The HLO graph does not define an ordering for add1 and add2, but their
+  // assignment onto the same stream does define an ordering.
+  if (order->ExecutesBefore(add1, add2)) {
+    EXPECT_FALSE(order->ExecutesBefore(add2, add1));
+  } else {
+    EXPECT_TRUE(order->ExecutesBefore(add2, add1));
+    EXPECT_FALSE(order->ExecutesBefore(add1, add2));
+  }
+
+  EXPECT_FALSE(order->ExecutesBefore(x, x));
+  EXPECT_FALSE(order->ExecutesBefore(x, y));
+  EXPECT_FALSE(order->ExecutesBefore(x, z));
+  EXPECT_FALSE(order->ExecutesBefore(y, x));
+  EXPECT_FALSE(order->ExecutesBefore(y, y));
+  EXPECT_FALSE(order->ExecutesBefore(y, z));
+  EXPECT_FALSE(order->ExecutesBefore(z, x));
+  EXPECT_FALSE(order->ExecutesBefore(z, y));
+  EXPECT_FALSE(order->ExecutesBefore(z, z));
+  EXPECT_FALSE(order->ExecutesBefore(z, add1));
+  EXPECT_FALSE(order->ExecutesBefore(add1, x));
+  EXPECT_FALSE(order->ExecutesBefore(add1, y));
+  EXPECT_FALSE(order->ExecutesBefore(add1, z));
+  EXPECT_FALSE(order->ExecutesBefore(add1, add1));
+  EXPECT_FALSE(order->ExecutesBefore(add2, x));
+  EXPECT_FALSE(order->ExecutesBefore(add2, y));
+  EXPECT_FALSE(order->ExecutesBefore(add2, z));
+  EXPECT_FALSE(order->ExecutesBefore(add2, add2));
 }
 
 // Test of two streams.
@@ -162,7 +216,7 @@ TEST_F(GpuHloScheduleTest, ConcurrentMatMul) {
   EXPECT_NE(streams->StreamNumberForHlo(*dot1),
             streams->StreamNumberForHlo(*dot2));
 
-  auto schedule = BuildGpuHloSchedule(module.get(), *streams);
+  auto schedule = BuildGpuHloSchedule(*module, *streams);
   // Remove parameters, which are unordered.
   HloVec thunk_launch_order = RemoveHlo(schedule->ThunkLaunchOrder(), {x, y});
   EXPECT_TRUE(thunk_launch_order == HloVec({dot1, dot2, add}) ||
@@ -254,7 +308,7 @@ TEST_F(GpuHloScheduleTest, LatticeMatMul) {
 
   // We don't check the thunk launch order, since there are many valid total
   // orders, and it's annoying to express.
-  auto schedule = BuildGpuHloSchedule(module.get(), *streams);
+  auto schedule = BuildGpuHloSchedule(*module, *streams);
 
   auto order = schedule->ConsumeHloOrdering();
   const HloVec all_params(
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 31cf4e74f32..349aee35289 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -3829,20 +3829,6 @@ Status CheckConditionalBuffersShareAllocation(
   return Status::OK();
 }
 
-Status AcceptMaybeOrdered(HloComputation* computation,
-                          IrEmitterUnnested* emitter,
-                          const BufferAssignment& buffer_assignment) {
-  const auto& debug_options = computation->parent()->config().debug_options();
-  if (debug_options.xla_gpu_disable_multi_streaming()) {
-    const HloInstructionSequence* sequence =
-        buffer_assignment.hlo_ordering().SequentialOrder(*computation);
-    // Always expect a sequential ordering for single-stream programs.
-    TF_RET_CHECK(sequence);
-    return computation->AcceptOrdered(emitter, sequence->instructions());
-  }
-  return computation->Accept(emitter);
-}
-
 }  // namespace
 
 StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildWhileThunk(
@@ -3856,18 +3842,14 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildWhileThunk(
   TF_ASSIGN_OR_RETURN(auto ir_emitter_condition,
                       IrEmitterUnnested::Create(hlo_module_config_, condition,
                                                 ir_emitter_context_));
-
-  TF_RETURN_IF_ERROR(
-      AcceptMaybeOrdered(condition, ir_emitter_condition.get(),
-                         ir_emitter_context_->buffer_assignment()));
+  TF_RETURN_IF_ERROR(condition->Accept(ir_emitter_condition.get()));
 
   // Generate thunk sequence for while 'body'.
   HloComputation* body = hlo->while_body();
   TF_ASSIGN_OR_RETURN(
       auto ir_emitter_body,
       IrEmitterUnnested::Create(hlo_module_config_, body, ir_emitter_context_));
-  TF_RETURN_IF_ERROR(AcceptMaybeOrdered(
-      body, ir_emitter_body.get(), ir_emitter_context_->buffer_assignment()));
+  TF_RETURN_IF_ERROR(body->Accept(ir_emitter_body.get()));
 
   const auto* index_map = ir_emitter_context_->profile_index_map();
   absl::optional<size_t> condition_profile_index, body_profile_index;
@@ -3895,8 +3877,7 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildForThunk(
   TF_ASSIGN_OR_RETURN(
       auto ir_emitter_body,
       IrEmitterUnnested::Create(hlo_module_config_, body, ir_emitter_context_));
-  TF_RETURN_IF_ERROR(AcceptMaybeOrdered(
-      body, ir_emitter_body.get(), ir_emitter_context_->buffer_assignment()));
+  TF_RETURN_IF_ERROR(body->Accept(ir_emitter_body.get()));
 
   const auto* index_map = ir_emitter_context_->profile_index_map();
   absl::optional<size_t> body_profile_index;
@@ -3933,8 +3914,7 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildConditionalThunk(
         auto ir_emitter,
         IrEmitterUnnested::Create(hlo_module_config_, branch_computation,
                                   ir_emitter_context_));
-    TF_CHECK_OK(AcceptMaybeOrdered(branch_computation, ir_emitter.get(),
-                                   ir_emitter_context_->buffer_assignment()));
+    TF_CHECK_OK(branch_computation->Accept(ir_emitter.get()));
     branch_thunks.push_back(std::move(*ir_emitter->ConsumeThunkSequence()));
 
     absl::optional<size_t> profile_index;