From 82e12bf3876a68a5c1cafe8bb622fb61e393e149 Mon Sep 17 00:00:00 2001
From: Tim Shen <timshen@google.com>
Date: Tue, 7 Jul 2020 20:55:32 -0700
Subject: [PATCH] [XLA/GPU] Decouple hlo_ordering from thunk_schedule.

The plumbing before this change:
* hlo_ordering -> buffer_assignment
* buffer_assignment -> ir_emitter_unnested (DFS order) -> thunks
* Apply hlo_ordering to thunks -> thunk_schedule

After:
* hlo_ordering -> buffer_assignment
* buffer_assignment -> ir_emitter_unnested (hlo_ordering) -> thunks
* thunks -> thunk_schedule (order unchanged)

The idea is that since thunks are scheduled in a certain total order
anyway, we can just use that order to invoke the emitter. This saves an
extra scheduling pass, but most importantly, it removes uses of
Thunk::hlo_instruction(), which makes the MLIR/GPU transition easier.

PiperOrigin-RevId: 320117281
Change-Id: I0ee9ff14e71869ea09d6223ae10448317298096f
---
 tensorflow/compiler/xla/service/gpu/gpu_compiler.cc |  8 +++++---
 .../compiler/xla/service/gpu/thunk_schedule.cc      | 13 +++++--------
 .../compiler/xla/service/gpu/thunk_schedule.h       |  3 +--
 .../xla/service/mlir_gpu/lhlo_dialect_emitter.cc    |  6 ++++--
 .../xla/service/mlir_gpu/lhlo_dialect_emitter.h     |  3 ++-
 .../xla/service/mlir_gpu/mlir_compiler_impl.cc      |  6 +++---
 6 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 2b31099d26f..f5ed7e3a114 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -518,8 +518,11 @@ static Status CompileModuleToLlvmIrImpl(
   {
     XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - IR emission");
-    TF_RETURN_IF_ERROR(entry_computation->Accept(&ir_emitter));
+    TF_RETURN_IF_ERROR(entry_computation->AcceptOrdered(
+        &ir_emitter, (*hlo_schedule)->ThunkLaunchOrder()));
   }
+  // The order of `thunk_sequence` corresponds to
+  // `hlo_schedule->ThunkLaunchOrder()`.
   *thunk_sequence = ir_emitter.ConsumeThunkSequence();
 
   return Status::OK();
 }
@@ -610,8 +613,7 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
                                          gpu_version, stream_exec));
 
   auto thunk_schedule = absl::make_unique<ThunkSchedule>(
-      std::move(thunk_sequence), std::move(stream_assignment),
-      hlo_schedule->ThunkLaunchOrder());
+      std::move(thunk_sequence), std::move(stream_assignment));
   if (DumpingEnabledForHloModule(*module)) {
     DumpToFileInDirOrStdout(*module, "", "thunk_schedule",
                             thunk_schedule->ToString());
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
index daa5f33e560..a91466e5c5f 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
@@ -49,21 +49,18 @@ void ThunkSchedule::AddDependenciesOnTransitiveOperands(
 
 ThunkSchedule::ThunkSchedule(
     std::unique_ptr<ThunkSequence> thunks,
-    std::unique_ptr<StreamAssignment> stream_assignment,
-    const std::vector<HloInstruction*>& hlo_total_order)
+    std::unique_ptr<StreamAssignment> stream_assignment)
     : thunks_(std::move(thunks)),
       stream_assignment_(std::move(stream_assignment)) {
+  for (auto& thunk : *thunks_) {
+    thunk_total_order_.push_back(thunk.get());
+  }
+
   absl::flat_hash_map<const HloInstruction*, Thunk*> hlo_to_thunk;
   for (const auto& thunk : *thunks_) {
     InsertOrDie(&hlo_to_thunk, thunk->hlo_instruction(), thunk.get());
   }
 
-  for (HloInstruction* hlo : hlo_total_order) {
-    if (Thunk** thunk = tensorflow::gtl::FindOrNull(hlo_to_thunk, hlo)) {
-      thunk_total_order_.push_back(*thunk);
-    }
-  }
-
   for (const Thunk* thunk : thunk_total_order_) {
     const auto* dst = thunk->hlo_instruction();
     CHECK(stream_assignment_->HasStreamAssigned(*dst));
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
index 549378debd5..73da708aa3d 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
@@ -47,8 +47,7 @@ namespace gpu {
 class ThunkSchedule {
  public:
   ThunkSchedule(std::unique_ptr<ThunkSequence> thunks,
-                std::unique_ptr<StreamAssignment> stream_assignment,
-                const std::vector<HloInstruction*>& hlo_total_order);
+                std::unique_ptr<StreamAssignment> stream_assignment);
 
   // Returns the total order of executing all the thunks.
   const std::vector<Thunk*>& TotalOrder() const { return thunk_total_order_; }
diff --git a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc
index a65096b7eac..3654271da53 100644
--- a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc
+++ b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc
@@ -226,8 +226,10 @@ absl::string_view LhloDialectEmitter::platform_name() const {
   return platform_->Name();
 }
 
-Status LhloDialectEmitter::EmitComputation(const HloComputation& computation) {
-  return computation.root_instruction()->Accept(this);
+Status LhloDialectEmitter::EmitComputation(
+    const HloComputation& computation,
+    absl::Span<HloInstruction* const> ordering) {
+  return computation.AcceptOrdered(this, ordering);
 }
 
 StatusOr<::mlir::FuncOp> LhloDialectEmitter::CreateFunction(
diff --git a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h
index 185c1e13bb7..2fe1947d625 100644
--- a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h
+++ b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h
@@ -47,7 +47,8 @@ class LhloDialectEmitter : public DfsHloVisitorWithDefault,
                      ::mlir::ModuleOp mlir_module);
   ~LhloDialectEmitter() override = default;
 
-  Status EmitComputation(const HloComputation& computation);
+  Status EmitComputation(const HloComputation& computation,
+                         absl::Span<HloInstruction* const> ordering);
 
   // The following methods implement the DfsHloVisitor interface.
   //
diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc
index beabc99a173..a2bee43a0f8 100644
--- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc
+++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc
@@ -489,7 +489,8 @@ StatusOr<std::unique_ptr<Executable>> MlirCompilerImpl::RunBackend(
                                 stream_exec->platform(), *mlir_module);
 
   TF_RETURN_IF_ERROR(lhlo_emitter.EmitComputation(
-      *emission_context.getHloModule()->entry_computation()));
+      *emission_context.getHloModule()->entry_computation(),
+      hlo_schedule->ThunkLaunchOrder()));
 
   TF_RETURN_IF_ERROR(
       module_hook_.invoke(IRHook::LoweringStage::LHLO, *mlir_module));
@@ -539,8 +540,7 @@ StatusOr<std::unique_ptr<Executable>> MlirCompilerImpl::RunBackend(
                                      gpu::PtxOptsFromConfig(config)));
 
   auto thunk_schedule = absl::make_unique<ThunkSchedule>(
-      std::move(thunk_sequence), std::move(stream_assignment),
-      hlo_schedule->ThunkLaunchOrder());
+      std::move(thunk_sequence), std::move(stream_assignment));
 
   if (DumpingEnabledForHloModule(*emission_context.getHloModule())) {
     DumpToFileInDirOrStdout(*emission_context.getHloModule(), "",
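
--
A minimal, self-contained C++ sketch of the contract this change relies on.
The Thunk, ThunkSequence, and ThunkSchedule below are simplified stand-ins,
not the real XLA classes, and the HLO names are made up: once the emitter is
driven in ThunkLaunchOrder, the emitted sequence already is the total order,
so the schedule can adopt it without a separate reordering pass.

// sketch.cc -- simplified stand-ins, not the real XLA types.
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct Thunk {
  std::string hlo_name;  // Stand-in for the HLO the thunk was emitted for.
};
using ThunkSequence = std::vector<std::unique_ptr<Thunk>>;

class ThunkSchedule {
 public:
  // Post-change constructor shape: no hlo_total_order parameter. The caller
  // guarantees `thunks` was emitted in launch order, so the total order is
  // simply the sequence order.
  explicit ThunkSchedule(std::unique_ptr<ThunkSequence> thunks)
      : thunks_(std::move(thunks)) {
    for (auto& thunk : *thunks_) {
      thunk_total_order_.push_back(thunk.get());
    }
  }

  const std::vector<Thunk*>& TotalOrder() const { return thunk_total_order_; }

 private:
  std::unique_ptr<ThunkSequence> thunks_;
  std::vector<Thunk*> thunk_total_order_;
};

int main() {
  // "Emission" driven by a precomputed launch order (made-up HLO names)
  // yields thunks already in that order.
  const std::vector<std::string> launch_order = {"param0", "multiply", "add"};
  auto thunks = std::make_unique<ThunkSequence>();
  for (const std::string& hlo : launch_order) {
    thunks->push_back(std::make_unique<Thunk>(Thunk{hlo}));
  }

  // No separate scheduling step: the schedule adopts the sequence as-is.
  ThunkSchedule schedule(std::move(thunks));
  for (const Thunk* thunk : schedule.TotalOrder()) {
    std::cout << thunk->hlo_name << "\n";  // param0, multiply, add
  }
  return 0;
}

The real constructor also takes a StreamAssignment and builds inter-stream
dependencies; the sketch elides that to isolate the ordering change.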