From 2c332fbc4be71314888cdf6ed4323855243f9610 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 17 Feb 2021 13:53:40 -0800 Subject: [PATCH] [XLA/GPU] Use SequentialHloOrdering for single stream modules. Also simplify tests in gpu_hlo_schedule_test. PiperOrigin-RevId: 358025620 Change-Id: I1de2f85c91765e967865c6ea90bc629627146559 --- .../compiler/xla/service/gpu/gpu_compiler.cc | 8 +- .../xla/service/gpu/gpu_hlo_schedule.cc | 25 ++++--- .../xla/service/gpu/gpu_hlo_schedule.h | 2 +- .../xla/service/gpu/gpu_hlo_schedule_test.cc | 74 ++++++++++++++++--- .../xla/service/gpu/ir_emitter_unnested.cc | 28 +------ 5 files changed, 86 insertions(+), 51 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 935c392dcee..d104c26964b 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -520,9 +520,9 @@ GpuCompiler::RunHloPassesAndBufferAssignement( std::unique_ptr stream_assignment = AssignStreams(*hlo_module); - TF_ASSIGN_OR_RETURN(std::unique_ptr hlo_schedule, - GpuHloSchedule::Build(hlo_module.get(), - *stream_assignment, pointer_size_)); + TF_ASSIGN_OR_RETURN( + std::unique_ptr hlo_schedule, + GpuHloSchedule::Build(*hlo_module, *stream_assignment, pointer_size_)); auto buffer_size_bytes_function = [this](const BufferValue& buffer_value) -> int64 { @@ -565,7 +565,7 @@ static Status CompileModuleToLlvmIrImpl( AssignStreams(*hlo_module); TF_ASSIGN_OR_RETURN( std::unique_ptr hlo_schedule, - GpuHloSchedule::Build(hlo_module, *stream_assignment, pointer_size)); + GpuHloSchedule::Build(*hlo_module, *stream_assignment, pointer_size)); auto buffer_size_bytes_function = [pointer_size](const BufferValue& buffer_value) -> int64 { diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc index 61c50622bed..1126943624a 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc @@ -190,29 +190,30 @@ GpuHloSchedule::GpuHloSchedule() {} /* static */ StatusOr> GpuHloSchedule::Build( - HloModule* module, const StreamAssignment& stream_assignment, + const HloModule& module, const StreamAssignment& stream_assignment, int64 pointer_size) { std::unique_ptr schedule(new GpuHloSchedule); // Initialize thunk_launch_order_, the total order of thunk launches. - HloComputation* entry_computation = module->entry_computation(); + HloComputation* entry_computation = module.entry_computation(); if (stream_assignment.StreamCount() == 1) { + // All kernels are launched on a single stream, so there's no loss of + // concurrency by optimizing for minimal memory usage. TF_ASSIGN_OR_RETURN( - HloSchedule sequences, - ScheduleModule(module, [pointer_size](const BufferValue& buffer) { - return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size); - })); - schedule->thunk_launch_order_ = - sequences.sequence(entry_computation).instructions(); - schedule->hlo_ordering_ = - absl::make_unique(sequences); + HloInstructionSequence sequence, + ScheduleComputation( + entry_computation, [pointer_size](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size); + })); + schedule->thunk_launch_order_ = sequence.instructions(); } else { // BFS tends to increase concurrency, but also increases memory usage. BFSLaunchOrder(entry_computation, &schedule->thunk_launch_order_); - schedule->hlo_ordering_ = absl::make_unique( - module, stream_assignment, schedule->thunk_launch_order_); } + schedule->hlo_ordering_ = absl::make_unique( + &module, stream_assignment, schedule->thunk_launch_order_); + return std::move(schedule); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h index c1493cee3ce..7f224ffe4f0 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h @@ -41,7 +41,7 @@ class GpuHloSchedule { // Constructs an GpuHloSchedule for the given module, based on the given // stream assignment. static StatusOr> Build( - HloModule* module, const StreamAssignment& stream_assignment, + const HloModule& module, const StreamAssignment& stream_assignment, int64 pointer_size); // Returns the total order of thunk launches, represented in terms of HLO diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc index 82cfd688380..91db7151f22 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc @@ -39,7 +39,7 @@ class GpuHloScheduleTest : public HloTestBase { Shape f32_2x2_ = ShapeUtil::MakeShape(F32, {2, 2}); static std::unique_ptr BuildGpuHloSchedule( - HloModule* module, const StreamAssignment& streams) { + const HloModule& module, const StreamAssignment& streams) { return GpuHloSchedule::Build(module, streams, /*pointer_size=*/8) .ConsumeValueOrDie(); } @@ -86,7 +86,7 @@ TEST_F(GpuHloScheduleTest, SequentialMatMul) { EXPECT_EQ(streams->StreamNumberForHlo(*dot1), streams->StreamNumberForHlo(*dot2)); - auto schedule = BuildGpuHloSchedule(module.get(), *streams); + auto schedule = BuildGpuHloSchedule(*module, *streams); // Remove parameters, which are unordered. EXPECT_EQ(RemoveHlo(schedule->ThunkLaunchOrder(), {x, y, z}), HloVec({dot1, dot2})); @@ -94,10 +94,32 @@ TEST_F(GpuHloScheduleTest, SequentialMatMul) { // Parameters x,y,z are mutually unordered, while dot1 and dot2 are // transitively ordered by operands. auto order = schedule->ConsumeHloOrdering(); - EXPECT_TRUE(order->ExecutesBefore(z, y)); - EXPECT_TRUE(order->ExecutesBefore(y, x)); EXPECT_TRUE(order->ExecutesBefore(x, dot1)); + EXPECT_TRUE(order->ExecutesBefore(x, dot2)); + EXPECT_TRUE(order->ExecutesBefore(y, dot1)); + EXPECT_TRUE(order->ExecutesBefore(y, dot2)); + EXPECT_TRUE(order->ExecutesBefore(z, dot2)); EXPECT_TRUE(order->ExecutesBefore(dot1, dot2)); + + EXPECT_FALSE(order->ExecutesBefore(x, x)); + EXPECT_FALSE(order->ExecutesBefore(x, y)); + EXPECT_FALSE(order->ExecutesBefore(x, z)); + EXPECT_FALSE(order->ExecutesBefore(y, x)); + EXPECT_FALSE(order->ExecutesBefore(y, y)); + EXPECT_FALSE(order->ExecutesBefore(y, z)); + EXPECT_FALSE(order->ExecutesBefore(z, x)); + EXPECT_FALSE(order->ExecutesBefore(z, y)); + EXPECT_FALSE(order->ExecutesBefore(z, z)); + EXPECT_FALSE(order->ExecutesBefore(z, dot1)); + EXPECT_FALSE(order->ExecutesBefore(dot1, x)); + EXPECT_FALSE(order->ExecutesBefore(dot1, y)); + EXPECT_FALSE(order->ExecutesBefore(dot1, z)); + EXPECT_FALSE(order->ExecutesBefore(dot1, dot1)); + EXPECT_FALSE(order->ExecutesBefore(dot2, x)); + EXPECT_FALSE(order->ExecutesBefore(dot2, y)); + EXPECT_FALSE(order->ExecutesBefore(dot2, z)); + EXPECT_FALSE(order->ExecutesBefore(dot2, dot1)); + EXPECT_FALSE(order->ExecutesBefore(dot2, dot2)); } // Test of a single stream, where data dependencies do not fully determine the @@ -126,7 +148,7 @@ TEST_F(GpuHloScheduleTest, SequentialAdd) { EXPECT_EQ(streams->StreamNumberForHlo(*add1), streams->StreamNumberForHlo(*add3)); - auto schedule = BuildGpuHloSchedule(module.get(), *streams); + auto schedule = BuildGpuHloSchedule(*module, *streams); // Remove parameters, which are unordered. EXPECT_EQ(RemoveHlo(schedule->ThunkLaunchOrder(), {x, y, z}), HloVec({add1, add2, add3})); @@ -134,11 +156,43 @@ TEST_F(GpuHloScheduleTest, SequentialAdd) { // Parameters x,y,z are mutually unordered, while add1, add2 and add3 are // transitively ordered by operands. auto order = schedule->ConsumeHloOrdering(); - EXPECT_TRUE(order->ExecutesBefore(y, z)); - EXPECT_TRUE(order->ExecutesBefore(z, x)); EXPECT_TRUE(order->ExecutesBefore(x, add1)); - EXPECT_TRUE(order->ExecutesBefore(add1, add2)); + EXPECT_TRUE(order->ExecutesBefore(x, add2)); + EXPECT_TRUE(order->ExecutesBefore(x, add3)); + EXPECT_TRUE(order->ExecutesBefore(y, add1)); + EXPECT_TRUE(order->ExecutesBefore(y, add2)); + EXPECT_TRUE(order->ExecutesBefore(y, add3)); + EXPECT_TRUE(order->ExecutesBefore(z, add2)); + EXPECT_TRUE(order->ExecutesBefore(z, add3)); + EXPECT_TRUE(order->ExecutesBefore(add1, add3)); EXPECT_TRUE(order->ExecutesBefore(add2, add3)); + // The HLO graph does not define an ordering for add1 and add2, but their + // assignment onto the same stream does define an ordering. + if (order->ExecutesBefore(add1, add2)) { + EXPECT_FALSE(order->ExecutesBefore(add2, add1)); + } else { + EXPECT_TRUE(order->ExecutesBefore(add2, add1)); + EXPECT_FALSE(order->ExecutesBefore(add1, add2)); + } + + EXPECT_FALSE(order->ExecutesBefore(x, x)); + EXPECT_FALSE(order->ExecutesBefore(x, y)); + EXPECT_FALSE(order->ExecutesBefore(x, z)); + EXPECT_FALSE(order->ExecutesBefore(y, x)); + EXPECT_FALSE(order->ExecutesBefore(y, y)); + EXPECT_FALSE(order->ExecutesBefore(y, z)); + EXPECT_FALSE(order->ExecutesBefore(z, x)); + EXPECT_FALSE(order->ExecutesBefore(z, y)); + EXPECT_FALSE(order->ExecutesBefore(z, z)); + EXPECT_FALSE(order->ExecutesBefore(z, add1)); + EXPECT_FALSE(order->ExecutesBefore(add1, x)); + EXPECT_FALSE(order->ExecutesBefore(add1, y)); + EXPECT_FALSE(order->ExecutesBefore(add1, z)); + EXPECT_FALSE(order->ExecutesBefore(add1, add1)); + EXPECT_FALSE(order->ExecutesBefore(add2, x)); + EXPECT_FALSE(order->ExecutesBefore(add2, y)); + EXPECT_FALSE(order->ExecutesBefore(add2, z)); + EXPECT_FALSE(order->ExecutesBefore(add2, add2)); } // Test of two streams. @@ -162,7 +216,7 @@ TEST_F(GpuHloScheduleTest, ConcurrentMatMul) { EXPECT_NE(streams->StreamNumberForHlo(*dot1), streams->StreamNumberForHlo(*dot2)); - auto schedule = BuildGpuHloSchedule(module.get(), *streams); + auto schedule = BuildGpuHloSchedule(*module, *streams); // Remove parameters, which are unordered. HloVec thunk_launch_order = RemoveHlo(schedule->ThunkLaunchOrder(), {x, y}); EXPECT_TRUE(thunk_launch_order == HloVec({dot1, dot2, add}) || @@ -254,7 +308,7 @@ TEST_F(GpuHloScheduleTest, LatticeMatMul) { // We don't check the thunk launch order, since there are many valid total // orders, and it's annoying to express. - auto schedule = BuildGpuHloSchedule(module.get(), *streams); + auto schedule = BuildGpuHloSchedule(*module, *streams); auto order = schedule->ConsumeHloOrdering(); const HloVec all_params( diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 31cf4e74f32..349aee35289 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3829,20 +3829,6 @@ Status CheckConditionalBuffersShareAllocation( return Status::OK(); } -Status AcceptMaybeOrdered(HloComputation* computation, - IrEmitterUnnested* emitter, - const BufferAssignment& buffer_assignment) { - const auto& debug_options = computation->parent()->config().debug_options(); - if (debug_options.xla_gpu_disable_multi_streaming()) { - const HloInstructionSequence* sequence = - buffer_assignment.hlo_ordering().SequentialOrder(*computation); - // Always expect a sequential ordering for single-stream programs. - TF_RET_CHECK(sequence); - return computation->AcceptOrdered(emitter, sequence->instructions()); - } - return computation->Accept(emitter); -} - } // namespace StatusOr> IrEmitterUnnested::BuildWhileThunk( @@ -3856,18 +3842,14 @@ StatusOr> IrEmitterUnnested::BuildWhileThunk( TF_ASSIGN_OR_RETURN(auto ir_emitter_condition, IrEmitterUnnested::Create(hlo_module_config_, condition, ir_emitter_context_)); - - TF_RETURN_IF_ERROR( - AcceptMaybeOrdered(condition, ir_emitter_condition.get(), - ir_emitter_context_->buffer_assignment())); + TF_RETURN_IF_ERROR(condition->Accept(ir_emitter_condition.get())); // Generate thunk sequence for while 'body'. HloComputation* body = hlo->while_body(); TF_ASSIGN_OR_RETURN( auto ir_emitter_body, IrEmitterUnnested::Create(hlo_module_config_, body, ir_emitter_context_)); - TF_RETURN_IF_ERROR(AcceptMaybeOrdered( - body, ir_emitter_body.get(), ir_emitter_context_->buffer_assignment())); + TF_RETURN_IF_ERROR(body->Accept(ir_emitter_body.get())); const auto* index_map = ir_emitter_context_->profile_index_map(); absl::optional condition_profile_index, body_profile_index; @@ -3895,8 +3877,7 @@ StatusOr> IrEmitterUnnested::BuildForThunk( TF_ASSIGN_OR_RETURN( auto ir_emitter_body, IrEmitterUnnested::Create(hlo_module_config_, body, ir_emitter_context_)); - TF_RETURN_IF_ERROR(AcceptMaybeOrdered( - body, ir_emitter_body.get(), ir_emitter_context_->buffer_assignment())); + TF_RETURN_IF_ERROR(body->Accept(ir_emitter_body.get())); const auto* index_map = ir_emitter_context_->profile_index_map(); absl::optional body_profile_index; @@ -3933,8 +3914,7 @@ StatusOr> IrEmitterUnnested::BuildConditionalThunk( auto ir_emitter, IrEmitterUnnested::Create(hlo_module_config_, branch_computation, ir_emitter_context_)); - TF_CHECK_OK(AcceptMaybeOrdered(branch_computation, ir_emitter.get(), - ir_emitter_context_->buffer_assignment())); + TF_CHECK_OK(branch_computation->Accept(ir_emitter.get())); branch_thunks.push_back(std::move(*ir_emitter->ConsumeThunkSequence())); absl::optional profile_index;