[XLA/GPU] Make module parameter to GpuExecutable optional.

If the user goes through the normal XLA pipeline, XLA GPU will ultimately pass the optimized HLO module to GpuExecutable for tooling, debugging, and testing support. If the user goes through a separate pipeline (e.g. pure MLIR input), the `debug_module` argument can be nullptr, but, as expected, none of the HLO tooling support will be available. PiperOrigin-RevId: 349470648 Change-Id: I60fa508faa7ddcf2abbd91586ebdca3ac148e0f2
2020-12-29 14:30:02 -08:00 · 2020-12-29 14:30:02 -08:00 · 054c7c1e8e
commit 054c7c1e8e
parent 676c9c4bce
7 changed files with 51 additions and 29 deletions
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@ -873,13 +873,21 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
      std::make_unique<BufferAssignmentProto>(buffer_assignment->ToProto());
  std::vector<BufferAllocation> allocations =
      buffer_assignment->ReleaseAllocations();
+  std::string module_name = module->name();
+  Shape output_shape = module->entry_computation()->root_instruction()->shape();
+  size_t profile_index = 0;
+  if (profile_index_map) {
+    profile_index =
+        profile_index_map->GetProfileIndexFor(*module->entry_computation());
+  }

  GpuVersion gpu_version = GetGpuVersion(stream_exec);
  auto* gpu_executable = new GpuExecutable(
      {std::move(backend_result.first), std::move(backend_result.second),
       gpu_version, std::move(thunk_schedule), std::move(constants),
-       std::move(output_info), std::move(module), std::move(allocations),
-       std::move(buffer_assignment_proto), std::move(profile_printer),
+       std::move(output_info), module_name, output_shape,
+       std::move(allocations), std::move(buffer_assignment_proto),
+       std::move(module), profile_index, std::move(profile_printer),
       std::move(profile_index_map)});
  if (embed_ir_in_executable) {
    DCHECK_NE("", ir_module_string_before_opt);
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@ -55,25 +55,26 @@ using ::tensorflow::profiler::ScopedAnnotation;
 // Implementation note: HLO profiling is always enabled for GPU executables,
 // since we can use timers around thunks.
 GpuExecutable::GpuExecutable(GpuExecutable::Params params)
-    : Executable(std::move(params.hlo_module),
+    : Executable(std::move(params.debug_module),
                 std::move(params.hlo_profile_printer_data),
                 std::move(params.hlo_profile_index_map)),
      text_(std::move(params.asm_text)),
      binary_(std::move(params.binary)),
      gpu_version_(params.gpu_version),
      thunk_schedule_(std::move(params.thunk_schedule)),
+      module_name_(params.module_name),
+      output_shape_(params.output_shape),
      allocations_(std::move(params.allocations)),
      debug_buffer_assignment_(std::move(params.debug_buffer_assignment)),
+      entry_computation_profile_index_(params.entry_computation_profile_index),
      constants_(std::move(params.constants)),
      output_info_(std::move(params.output_info)) {
-  CHECK(has_module());
-  GpuDebugInfoManager::Get()->RegisterModule(module().name(), shared_module(),
+  GpuDebugInfoManager::Get()->RegisterModule(module_name_, shared_module(),
                                             debug_buffer_assignment_);
 }

 GpuExecutable::~GpuExecutable() {
-  CHECK(has_module());
-  GpuDebugInfoManager::Get()->UnregisterModule(module().name(), shared_module(),
+  GpuDebugInfoManager::Get()->UnregisterModule(module_name_, shared_module(),
                                               debug_buffer_assignment_);

  {
@ -129,9 +130,9 @@ Status GpuExecutable::ExecuteThunks(
    HloExecutionProfile* hlo_execution_profile) {
  TF_RETURN_IF_ERROR(
      CheckCompatibilityWithServiceExecutableRunOptions(run_options));
-  GpuDebugInfoManager::Get()->OnModuleStart(module().name());
+  GpuDebugInfoManager::Get()->OnModuleStart(module_name_);
  auto cleanup = MakeCleanup(
-      [&]() { GpuDebugInfoManager::Get()->OnModuleStop(module().name()); });
+      [&]() { GpuDebugInfoManager::Get()->OnModuleStop(module_name_); });

  se::Stream* main_stream = run_options->stream();
  se::StreamExecutor* executor = main_stream->parent();
@ -154,11 +155,11 @@ Status GpuExecutable::ExecuteThunks(
  }

  HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
-                                sub_streams, hlo_module_->entry_computation());
+                                sub_streams, entry_computation_profile_index_);
  uint64 start_micros = tensorflow::Env::Default()->NowMicros();

  tensorflow::profiler::TraceMe hlo_module_activity(
-      [&] { return absl::StrCat(hlo_module_->name(), ":XLA GPU module"); },
+      [&] { return absl::StrCat(module_name_, ":XLA GPU module"); },
      tensorflow::profiler::TraceMeLevel::kInfo);

  std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event;
@ -243,9 +244,8 @@ Status GpuExecutable::ExecuteThunks(

    // If hlo profiling was disabled then the cycle count is left empty.
    if (do_profile) {
-      profile->set_compute_cycle_count(
-          hlo_execution_profile->total_cycles_executed(
-              *module().entry_computation()));
+      profile->set_compute_cycle_count(hlo_execution_profile->GetCyclesTakenBy(
+          entry_computation_profile_index_));
    }
  }

@ -396,8 +396,8 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
    const ServiceExecutableRunOptions* run_options,
    std::vector<ExecutionInput> arguments,
    HloExecutionProfile* hlo_execution_profile) {
-  XLA_SCOPED_LOGGING_TIMER(absl::StrCat("GpuExecutable::ExecuteAsyncOnStream(",
-                                        module().name(), ")"));
+  XLA_SCOPED_LOGGING_TIMER(
+      absl::StrCat("GpuExecutable::ExecuteAsyncOnStream(", module_name_, ")"));
  se::DeviceMemoryAllocator* const memory_allocator = run_options->allocator();
  // Force synchronous execution if the allocator requires it.
  const bool block_host_until_done =
@ -414,10 +414,8 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(

  se::StreamExecutor* executor = run_options->stream()->parent();

-  HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
-  const Shape& root_shape = root->shape();
  auto device_ordinal = executor->device_ordinal();
-  ExecutionOutput result(/*on_device_shape=*/root->shape(), memory_allocator,
+  ExecutionOutput result(/*on_device_shape=*/output_shape_, memory_allocator,
                         device_ordinal);

  TF_ASSIGN_OR_RETURN(BufferAllocations buffer_allocations,
@ -482,7 +480,7 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
        VLOG(3) << "Using copy-protection: aliasing is specified, but the "
                   "buffer is not donated; allocating a fresh buffer";
        int64 allocation_size =
-            ShapeUtil::ByteSizeOf(ShapeUtil::GetSubshape(root_shape, index));
+            ShapeUtil::ByteSizeOf(ShapeUtil::GetSubshape(output_shape_, index));
        TF_ASSIGN_OR_RETURN(
            se::OwningDeviceMemory allocated_buffer,
            memory_allocator->Allocate(device_ordinal, allocation_size));
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@ -74,9 +74,12 @@ class GpuExecutable : public Executable {
    std::unique_ptr<const ThunkSchedule> thunk_schedule;
    std::vector<ConstantInfo> constants;
    absl::flat_hash_map<ShapeIndex, OutputInfo> output_info;
-    std::unique_ptr<HloModule> hlo_module;
+    std::string module_name;
+    xla::Shape output_shape;
    std::vector<BufferAllocation> allocations;
    std::unique_ptr<BufferAssignmentProto> debug_buffer_assignment;
+    std::unique_ptr<HloModule> debug_module = nullptr;
+    size_t entry_computation_profile_index = 0;
    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data = nullptr;
    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map = nullptr;
  };
@ -179,12 +182,18 @@ class GpuExecutable : public Executable {
  // IrEmitter.
  const std::unique_ptr<const ThunkSchedule> thunk_schedule_;

+  std::string module_name_;
+
+  xla::Shape output_shape_;
+
  // Owns the buffer data at runtime. It provides information to allocate
  // memory for every output/temp buffers.
  const std::vector<BufferAllocation> allocations_;

  std::shared_ptr<BufferAssignmentProto> debug_buffer_assignment_;

+  size_t entry_computation_profile_index_ = -1;
+
  // Cache of module handles and constant buffer allocation maps used by
  // `ResolveConstantGlobals`.
  tensorflow::mutex module_handle_mutex_;
--- a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc
@ -52,13 +52,12 @@ uint64 GetCyclesTaken(std::stack<std::unique_ptr<se::Timer>>* timers,

 HloExecutionProfiler::HloExecutionProfiler(
    bool do_profile, HloExecutionProfile* profile, se::Stream* stream,
-    const std::vector<StreamPool::Ptr>& sub_streams,
-    const HloComputation* computation)
+    const std::vector<StreamPool::Ptr>& sub_streams, size_t index)
    : do_profile_(do_profile),
      profile_(profile),
      stream_(stream),
      sub_streams_(sub_streams),
-      computation_(computation) {
+      computation_profile_index_(index) {
  if (do_profile_) {
    clock_rate_ghz_ = stream->parent()->GetDeviceDescription().clock_rate_ghz();
    InitAndStartTimer(&timers_, stream);
@ -69,8 +68,8 @@ void HloExecutionProfiler::FinishExecution() {
  CHECK(!finished_execution_) << "Call FinishExecution only once!";
  finished_execution_ = true;
  if (do_profile_) {
-    profile_->set_total_cycles_executed(
-        *computation_,
+    profile_->SetCyclesTakenBy(
+        computation_profile_index_,
        GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_));
  }
 }
--- a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h
+++ b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h
@ -40,7 +40,7 @@ class HloExecutionProfiler {
  explicit HloExecutionProfiler(bool do_profile, HloExecutionProfile* profile,
                                se::Stream* stream,
                                const std::vector<StreamPool::Ptr>& sub_streams,
-                                const HloComputation* computation);
+                                size_t index);

  // If profiling is enabled, sets the total cycle count on the profile from the
  // execution timer.
@ -80,7 +80,7 @@ class HloExecutionProfiler {
  HloExecutionProfile* profile_;
  se::Stream* stream_;
  const std::vector<StreamPool::Ptr>& sub_streams_;
-  const HloComputation* computation_;
+  size_t computation_profile_index_;
  std::stack<std::unique_ptr<se::Timer>> timers_;
  // Contains the HLO instructions for which we are currently measuring the
  // time.
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@ -142,7 +142,11 @@ void HloExecutionProfile::SetCyclesTakenBy(size_t index, uint64 cycles_taken) {
 }

 uint64 HloExecutionProfile::GetCyclesTakenBy(const HloInstruction& hlo) const {
-  return profile_counters_[hlo_profile_index_map_.GetProfileIndexFor(hlo)];
+  return GetCyclesTakenBy(hlo_profile_index_map_.GetProfileIndexFor(hlo));
+}
+
+uint64 HloExecutionProfile::GetCyclesTakenBy(size_t index) const {
+  return profile_counters_[index];
 }

 HloExecutionProfileData HloExecutionProfile::ToProto() const {
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@ -121,6 +121,10 @@ class HloExecutionProfile {
  // may not be available for some instructions in which case zero is returned.
  uint64 GetCyclesTakenBy(const HloInstruction& hlo) const;

+  // Returns the cycles recorded for the profile counter at `index`, or zero
+  // when no profiling information is available for that counter.
+  uint64 GetCyclesTakenBy(size_t index) const;
+
  // Return the number of cycles this computation took to execute.
  uint64 total_cycles_executed(const HloComputation& computation) const {
    return profile_counters_[hlo_profile_index_map_.GetProfileIndexFor(